Author

Maria Knapczyk

1 Intro

Show and hide code
library(tidymodels)
library(tidyverse)
library(stringr)
library(vip)
library(pdp)
library(DALEX)
library(DALEXtra)
library(bestNormalize)
library(rules)
library(baguette)
library(finetune)
library(doParallel)
library(DT)
library(ggplot2)
library(lubridate)
library(future)

tidymodels_prefer()
Show and hide code
data = read.csv("coffee_shop_sales.csv")
datatable(data)

2 Data

Show and hide code
str(data)
'data.frame':   4998 obs. of  15 variables:
 $ transaction_id       : int  1 2 3 4 5 6 7 8 9 10 ...
 $ transaction_date     : chr  "2025-03-13" "2024-09-26" "2025-03-23" "2025-05-28" ...
 $ transaction_time     : chr  "10:10:19" "13:13:41" "16:17:20" "16:31:54" ...
 $ product_id           : chr  "P302" "P302" "P302" "P201" ...
 $ product_name         : chr  "Croissant" "Croissant" "Croissant" "Green Tea" ...
 $ product_category     : chr  "Bakery" "Bakery" "Bakery" "Tea" ...
 $ size                 : chr  NA NA NA "Small" ...
 $ quantity             : int  1 1 2 2 3 1 1 3 3 3 ...
 $ unit_price           : num  3 3 3 2 3 2.5 2 3.5 2.5 3 ...
 $ total_price          : num  3 3 6 4 9 2.5 2 10.5 7.5 9 ...
 $ payment_method       : chr  "Debit Card" "Credit Card" "Cash" "Credit Card" ...
 $ customer_id          : chr  "C624" "C549" "C857" "C402" ...
 $ loyalty_points_earned: int  6 6 0 8 0 3 7 2 7 9 ...
 $ location             : chr  "Airport" "University" "Airport" "Downtown" ...
 $ barista_id           : chr  "EMP001" "EMP001" "EMP002" "EMP004" ...
Show and hide code
data <- data |> 
  as_tibble() |> 
  janitor::clean_names() |> 

  mutate(
    transaction_date = as.Date(transaction_date),
    transaction_datetime = as.POSIXct(
      paste(transaction_date, transaction_time),
      format="%Y-%m-%d %H:%M:%S"
    )
  ) |> 

  mutate(
    hour = hour(transaction_datetime)
  ) |> 

  mutate(
    wday = wday(transaction_date, label = TRUE, abbr = TRUE),
    day_work = if_else(wday %in% c("Sat", "Sun"), "weekend", "week"),
    day_work = factor(day_work)
  )
Show and hide code
data |> summary()
 transaction_id transaction_date     transaction_time    product_id       
 Min.   :   1   Min.   :2024-08-08   Length:4998        Length:4998       
 1st Qu.:1250   1st Qu.:2024-11-07   Class :character   Class :character  
 Median :2500   Median :2025-02-08   Mode  :character   Mode  :character  
 Mean   :2500   Mean   :2025-02-07                                        
 3rd Qu.:3749   3rd Qu.:2025-05-08                                        
 Max.   :4998   Max.   :2025-08-07                                        
                                                                          
 product_name       product_category       size              quantity    
 Length:4998        Length:4998        Length:4998        Min.   :1.000  
 Class :character   Class :character   Class :character   1st Qu.:1.000  
 Mode  :character   Mode  :character   Mode  :character   Median :2.000  
                                                          Mean   :1.986  
                                                          3rd Qu.:3.000  
                                                          Max.   :3.000  
                                                                         
   unit_price     total_price    payment_method     customer_id       
 Min.   :2.000   Min.   : 2.00   Length:4998        Length:4998       
 1st Qu.:2.500   1st Qu.: 3.50   Class :character   Class :character  
 Median :3.000   Median : 6.00   Mode  :character   Mode  :character  
 Mean   :3.096   Mean   : 6.15                                        
 3rd Qu.:3.500   3rd Qu.: 8.00                                        
 Max.   :4.500   Max.   :13.50                                        
                                                                      
 loyalty_points_earned   location          barista_id       
 Min.   : 0.000        Length:4998        Length:4998       
 1st Qu.: 0.000        Class :character   Class :character  
 Median : 1.000        Mode  :character   Mode  :character  
 Mean   : 3.022                                             
 3rd Qu.: 6.000                                             
 Max.   :10.000                                             
                                                            
 transaction_datetime                  hour        wday        day_work   
 Min.   :2024-08-08 12:15:25.00   Min.   : 8.00   Sun:713   week   :3539  
 1st Qu.:2024-11-07 11:53:53.00   1st Qu.:11.00   Mon:682   weekend:1459  
 Median :2025-02-08 10:53:26.00   Median :14.00   Tue:716                 
 Mean   :2025-02-07 15:31:04.34   Mean   :13.93   Wed:687                 
 3rd Qu.:2025-05-08 20:26:52.75   3rd Qu.:17.00   Thu:770                 
 Max.   :2025-08-07 19:48:16.00   Max.   :20.00   Fri:684                 
                                                  Sat:746                 
Show and hide code
colSums(is.na(data))
       transaction_id      transaction_date      transaction_time 
                    0                     0                     0 
           product_id          product_name      product_category 
                    0                     0                     0 
                 size              quantity            unit_price 
                 1464                     0                     0 
          total_price        payment_method           customer_id 
                    0                     0                  1537 
loyalty_points_earned              location            barista_id 
                    0                     0                     0 
 transaction_datetime                  hour                  wday 
                    0                     0                     0 
             day_work 
                    0 
Show and hide code
cor(data[sapply(data, is.numeric)])
                      transaction_id     quantity   unit_price   total_price
transaction_id           1.000000000 -0.003025046  0.013439873 -0.0021041791
quantity                -0.003025046  1.000000000  0.000410798  0.8658484256
unit_price               0.013439873  0.000410798  1.000000000  0.4625576353
total_price             -0.002104179  0.865848426  0.462557635  1.0000000000
loyalty_points_earned   -0.003518060  0.013822510 -0.011421552  0.0003937413
hour                     0.007342483  0.014040713  0.017121467  0.0188011150
                      loyalty_points_earned         hour
transaction_id                -0.0035180599  0.007342483
quantity                       0.0138225098  0.014040713
unit_price                    -0.0114215517  0.017121467
total_price                    0.0003937413  0.018801115
loyalty_points_earned          1.0000000000 -0.008620883
hour                          -0.0086208833  1.000000000

3 Plots

Show and hide code
ggplot(data, aes(x=total_price)) +
  labs(
    title = "Histogram cen",
    x = "Cena",
    y = "Ilość"
  ) + 
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black")

Show and hide code
ggplot(data, aes(x=loyalty_points_earned)) + 
  labs(
    title = "Histogram zdobytych punktów lojanościowych",
    x = "Przedziały",
    y = "Ilość"
  ) + 
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black")

Show and hide code
ggplot(data, aes(x = product_category)) +
  geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych kategoriach produktów",
    x = "Kategoria produktu",
    y = "Liczba transakcji"
  ) 

Show and hide code
ggplot(data, aes(x=payment_method)) + geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji poszczególnych metod płatniczych",
    x = "Metoda płatności",
    y = "Liczba transakcji"
  )

Show and hide code
ggplot(data, aes(x=location)) + geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych lokalizacjach",
    x = "Lokalizacja",
    y = "Liczba transakcji"
  )

Show and hide code
hourly_products <- data %>%
  group_by(hour, product_name) %>%
  summarise(transactions = n(), .groups = "drop") %>%
  arrange(hour, desc(transactions)) %>%
  group_by(hour) %>%
  slice_max(transactions, n = 3)


ggplot(hourly_products, aes(x = factor(hour), y = transactions, fill = product_name)) +
  geom_col() +
  labs(
    title = "Najczęściej kupowany produkt w każdej godzinie",
    x = "Godzina",
    y = "Liczba transakcji",
    fill = "Produkt"
  )

4 Formula

Show and hide code
forms <- formula(total_price ~ .)

5 Data split

Show and hide code
set.seed(111)

trans_split <- initial_validation_split(data = data, strata = total_price)

trans_train <- training(trans_split)
trans_test  <- testing(trans_split)
trans_valid <- validation_set(trans_split)

trans_folds <- vfold_cv(trans_train, strata = total_price, v = 10, repeats = 5) # testowo

save(trans_split, trans_train, trans_test, trans_valid, trans_folds,file = "coffesplitted_data.Rdata")
Show and hide code
load("coffesplitted_data.Rdata")

6 Model

Show and hide code
# CART
cart_spec <- decision_tree(
  cost_complexity = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("regression")

# Random Forest
rf_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = tune()
) %>%
  set_engine("ranger") %>%
  set_mode("regression")

# Cubist
cubist_spec <- cubist_rules(
  committees = tune(),
  neighbors = tune()
) %>%
  set_engine("Cubist") %>%
  set_mode("regression")

# XGBoost
xgb_spec <- boost_tree(
  tree_depth     = tune(),
  learn_rate     = tune(),
  loss_reduction = tune(),
  min_n          = tune(),
  sample_size    = tune(),
  trees          = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# MARS
mars_spec <- mars(
  num_terms = tune(),
  prod_degree = tune()
) %>%
  set_engine("earth") %>%
  set_mode("regression")

# knn
knn_spec <- nearest_neighbor(
  neighbors = tune()
) %>%
  set_engine("kknn") %>%
  set_mode("regression")

7 Recipe

Show and hide code
basic_rec <- recipe(total_price ~ ., data = trans_train) %>%
  step_mutate(hour = as.numeric(hour)) %>%
  update_role(
    transaction_id, transaction_date, transaction_time,
    product_id, customer_id, barista_id,
    new_role = "ID"
  ) %>%
  step_rm(quantity, loyalty_points_earned)

basic_t_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_nominal_predictors())

cubist_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_string2factor(all_nominal_predictors()) %>%
  step_zv(all_predictors())

prep(basic_t_rec, training = trans_train) %>% juice()
# A tibble: 2,997 × 39
   transaction_id transaction_date transaction_time product_id unit_price
            <int> <date>           <chr>            <chr>           <dbl>
 1              1 2025-03-13       10:10:19         P302              3  
 2              5 2025-01-16       19:17:24         P302              3  
 3              7 2024-09-16       20:51:57         P201              2  
 4              8 2024-11-19       14:17:27         P103              3.5
 5             10 2025-04-09       10:34:23         P302              3  
 6             11 2024-12-21       20:42:07         P102              4  
 7             14 2024-10-17       19:22:30         P202              4  
 8             15 2025-02-11       18:53:16         P102              3  
 9             16 2025-05-08       12:16:37         P102              3  
10             18 2024-09-11       15:27:29         P202              4  
# ℹ 2,987 more rows
# ℹ 34 more variables: customer_id <chr>, barista_id <chr>,
#   transaction_datetime <dbl>, hour <dbl>, total_price <dbl>,
#   product_name_Cappuccino <dbl>, product_name_Chai.Latte <dbl>,
#   product_name_Croissant <dbl>, product_name_Espresso <dbl>,
#   product_name_Green.Tea <dbl>, product_name_Latte <dbl>,
#   product_name_unknown <dbl>, product_category_Coffee <dbl>, …
Show and hide code
prep(basic_rec, training = trans_train) |> juice()
# A tibble: 2,997 × 17
   transaction_id transaction_date transaction_time product_id product_name
            <int> <date>           <chr>            <chr>      <fct>       
 1              1 2025-03-13       10:10:19         P302       Croissant   
 2              5 2025-01-16       19:17:24         P302       Croissant   
 3              7 2024-09-16       20:51:57         P201       Green Tea   
 4              8 2024-11-19       14:17:27         P103       Latte       
 5             10 2025-04-09       10:34:23         P302       Croissant   
 6             11 2024-12-21       20:42:07         P102       Cappuccino  
 7             14 2024-10-17       19:22:30         P202       Chai Latte  
 8             15 2025-02-11       18:53:16         P102       Cappuccino  
 9             16 2025-05-08       12:16:37         P102       Cappuccino  
10             18 2024-09-11       15:27:29         P202       Chai Latte  
# ℹ 2,987 more rows
# ℹ 12 more variables: product_category <fct>, size <fct>, unit_price <dbl>,
#   payment_method <fct>, customer_id <chr>, location <fct>, barista_id <chr>,
#   transaction_datetime <dttm>, hour <dbl>, wday <ord>, day_work <fct>,
#   total_price <dbl>
Show and hide code
summary(basic_rec) |> knitr::kable()
variable type role source
transaction_id integer, numeric ID original
transaction_date date ID original
transaction_time string , unordered, nominal ID original
product_id string , unordered, nominal ID original
product_name string , unordered, nominal predictor original
product_category string , unordered, nominal predictor original
size string , unordered, nominal predictor original
quantity integer, numeric predictor original
unit_price double , numeric predictor original
payment_method string , unordered, nominal predictor original
customer_id string , unordered, nominal ID original
loyalty_points_earned integer, numeric predictor original
location string , unordered, nominal predictor original
barista_id string , unordered, nominal ID original
transaction_datetime datetime predictor original
hour integer, numeric predictor original
wday ordered, nominal predictor original
day_work factor , unordered, nominal predictor original
total_price double , numeric outcome original

8 Workflow

Show and hide code
a <- workflow_set(
  preproc = list(b = basic_rec),
  models  = list(
    rpart = cart_spec,
    ranger = rf_spec
  )
)

b <- workflow_set(
  preproc = list(t = basic_t_rec),
  models  = list(xgboost = xgb_spec
  )
)

c <- workflow_set(
  preproc = list(cubist = cubist_rec),
  models = list(cubist = cubist_spec)
)

d <- workflow_set(
  preproc = list(transformed = basic_rec),
  models = list(
    mars = mars_spec,
    knn  = knn_spec
  )
)

basic <- bind_rows(a, b, c, d)
basic$wflow_id <- str_sub(basic$wflow_id, start = 3, end = 100)

basic
# A workflow set/tibble: 6 × 4
  wflow_id       info             option    result    
  <chr>          <list>           <list>    <list>    
1 rpart          <tibble [1 × 4]> <opts[0]> <list [0]>
2 ranger         <tibble [1 × 4]> <opts[0]> <list [0]>
3 xgboost        <tibble [1 × 4]> <opts[0]> <list [0]>
4 bist_cubist    <tibble [1 × 4]> <opts[0]> <list [0]>
5 ansformed_mars <tibble [1 × 4]> <opts[0]> <list [0]>
6 ansformed_knn  <tibble [1 × 4]> <opts[0]> <list [0]>

9 Parameters

Show and hide code
#n nrow i p predyktory

cart_param <- cart_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)) # <1% z n
  )


rf_param <- rf_spec |>
  extract_parameter_set_dials() |> 
  update(
    min_n = min_n(c(5, 30)),
    mtry =  mtry(c(4, 9)),   # sqrt i polowa p
    trees = trees(c(100, 500))
  )


xgb_param <- xgb_spec |> 
  extract_parameter_set_dials() |> 
  update(
    min_n = min_n(c(5, 30)), 
    trees = trees(c(100, 500)), 
    tree_depth = tree_depth(c(2,10)),
    learn_rate = learn_rate(c(0.01, 0.3)),
    loss_reduction = loss_reduction(c(0, 5)),
    sample_size = sample_prop(c(0.5, 1))
  )


cubist_param <- cubist_spec |> 
  extract_parameter_set_dials() |> 
  update(
    committees = committees(c(1, 10)),
    neighbors  = neighbors(c(0, 5))
  )


mars_param <- mars_spec |>
  extract_parameter_set_dials() |>
  update(
    num_terms = num_terms(c(2, 30)),
    prod_degree = prod_degree(c(1, 2))
  )


knn_param <- knn_spec |>
  extract_parameter_set_dials() |>
  update(
    neighbors = neighbors(c(3, 15))
  )



basic <- basic |> option_add(param_info = cart_param, id = "rpart")
basic <- basic |> option_add(param_info = rf_param, id = "ranger")
basic <- basic |> option_add(param_info = xgb_param, id = "xgboost")
basic <- basic |> option_add(param_info = cubist_param, id = "cubist")
basic <- basic |> option_add(param_info = mars_param, id = "mars")
basic <- basic |> option_add(param_info = knn_param, id = "knn")


basic
# A workflow set/tibble: 6 × 4
  wflow_id       info             option    result    
  <chr>          <list>           <list>    <list>    
1 rpart          <tibble [1 × 4]> <opts[1]> <list [0]>
2 ranger         <tibble [1 × 4]> <opts[1]> <list [0]>
3 xgboost        <tibble [1 × 4]> <opts[1]> <list [0]>
4 bist_cubist    <tibble [1 × 4]> <opts[0]> <list [0]>
5 ansformed_mars <tibble [1 × 4]> <opts[0]> <list [0]>
6 ansformed_knn  <tibble [1 × 4]> <opts[0]> <list [0]>
Show and hide code
basic |>
  split(~wflow_id) |>
  map(
    \(x) extract_parameter_set_dials(
      x = x,
      id = x$wflow_id
    ) |>
      _$object
  )
$ansformed_knn
$ansformed_knn[[1]]


$ansformed_mars
$ansformed_mars[[1]]

$ansformed_mars[[2]]


$bist_cubist
$bist_cubist[[1]]

$bist_cubist[[2]]


$ranger
$ranger[[1]]

$ranger[[2]]

$ranger[[3]]


$rpart
$rpart[[1]]

$rpart[[2]]


$xgboost
$xgboost[[1]]

$xgboost[[2]]

$xgboost[[3]]

$xgboost[[4]]

$xgboost[[5]]

$xgboost[[6]]

10 Grid and tune

Show and hide code
race_ctrl <- control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = FALSE,
  verbose = TRUE
)
Show and hide code
cores <- parallel::detectCores(logical = FALSE)
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)

race_models <- basic |> 
  filter(wflow_id %in% c("rpart", "ranger", "bist_cubist", "xgboost"))

time_race <- Sys.time()
race_result <- workflow_map(
  race_models,
  "tune_race_anova",
  seed = 111,
  resamples = trans_folds,
  grid = 50,
  control = race_ctrl,
  verbose = TRUE,
  metrics = metric_set(rmse, mae, rsq)
)

Sys.time() - time_race
Time difference of 7.449147 mins
Show and hide code
grid_ctrl <- control_grid(
  save_pred = TRUE,
  parallel_over = "everything"
)

# workflow_set już masz
grid_models <- workflow_set(
  preproc = list(transformed = basic_t_rec),
  models = list(
    mars = mars_spec,
    knn  = knn_spec
  )
)

# strojenie wszystkich workflowów naraz
grid_result <- workflow_map(
  grid_models,
  "tune_grid",
  resamples = trans_folds,
  grid = 20,
  metrics = metric_set(rmse, mae, rsq),
  control = grid_ctrl
)


stopCluster(cl)

save(race_result, grid_result, file = "tune_results_split.Rdata")

11 Best model selection

Show and hide code
load("tune_results_split.Rdata")
combined_results <- bind_rows(race_result, grid_result)

best_results <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x)
      extract_workflow_set_result(x = x, id = x$wflow_id) |>
      select_best(metric = "rmse", )
  )

best_models <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x) 
      extract_workflow(x = x, id = x$wflow_id) |>
      finalize_workflow(best_results[[x$wflow_id]]) |>
      last_fit(
        split = trans_split, 
        metrics = metric_set(rmse, rsq, mae)
        )
  )

save(best_models, file = "best_models.rdata")
Show and hide code
load("tune_result.Rdata")
load("best_models.rdata")

12 Result tune

Show and hide code
combined_results |>
  split(~wflow_id) |>
  map(
    \(x) 
      extract_workflow_set_result(x = x, id = x$wflow_id) |> 
      show_best(metric = "rmse", n = 1) |> 
      select(-n, -.metric, -.config)
  ) |>
  knitr::kable()
committees neighbors .estimator mean std_err
91 0 standard 2.592486 0.0065457
mtry trees min_n .estimator mean std_err
4 157 25 standard 2.650011 0.0084823
cost_complexity min_n .estimator mean std_err
0.0051795 21 standard 2.60576 0.0073104
neighbors .estimator mean std_err
15 standard 2.769503 0.0094328
num_terms prod_degree .estimator mean std_err
3 2 standard 2.594893 0.0067205
trees min_n tree_depth learn_rate loss_reduction sample_size .estimator mean std_err
108 9 5 1.095448 222.2996 0.8061224 standard 2.629619 0.0078156
Show and hide code
combined_results |> 
  rank_results(select_best = T) |> 
  unite("rate", c("mean", "std_err"), sep = "/") |> 
  pivot_wider(names_from = .metric, values_from = rate) |>
  separate_wider_delim(
    cols = mae:rsq, 
    delim = "/", 
    names = c("", "_std_err"), 
    names_sep = "") |> 
  select(-preprocessor, -n, -model) |>
  mutate(.config = str_sub(.config, 20, 30)) |> 
  mutate(across(
    .cols = mae:rsq_std_err, 
    .fns = \(x) signif(x = as.numeric(x), digits = 3)
  )) |> arrange(mae) |> 
  gt::gt() |> 
  gt::tab_header(title = "Wyniki oceny dla zestawu walidacyjnego")
Wyniki oceny dla zestawu walidacyjnego
wflow_id .config rank mae mae_std_err rmse rmse_std_err rsq rsq_std_err
bist_cubist 46 1 2.07 0.00903 2.59 0.00655 0.220 0.00522
transformed_mars 6 2 2.09 0.00888 2.59 0.00672 0.218 0.00520
rpart 43 3 2.10 0.00951 2.61 0.00731 0.212 0.00533
xgboost 09 4 2.18 0.00900 2.63 0.00782 0.200 0.00533
ranger 07 5 2.21 0.00907 2.65 0.00848 0.188 0.00506
transformed_knn 14 6 2.32 0.00921 2.77 0.00943 0.138 0.00460
Show and hide code
combined_results |>  
  rank_results(select_best = T) |>
  mutate(wflow_id = fct_reorder(wflow_id, rank)) |> 
  ggplot(aes(wflow_id, mean, colour = wflow_id)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - 1.96 * std_err,
                    ymax = mean + 1.96 * std_err), width = 0.8) +
  facet_wrap(~.metric, scales = "free_y") +
  theme_bw() + 
  labs(x = "model", 
       y = "value metric", 
       colour = "model") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Show and hide code
ggsave(filename = "rys. 3._valid_result_model.jpeg", device = "jpeg", 
       width = 8.5, height = 3, dpi = 300)
Show and hide code
all_fit_metrics <-
  combined_results |>
  split(~wflow_id) |>
  map(
    \(x)
    extract_workflow_set_result(x = x, id = x$wflow_id) |>
      _$.metrics |>
      _[[1]] |> 
      mutate(.config = str_sub(.config, 20, 24))
  )    

12.1 Ranger

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "ranger") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric mtry trees min_n .config
1 4 279 24 Preprocessor1_Model02
2 4 157 25 Preprocessor1_Model07
3 4 157 25 Preprocessor1_Model07
Show and hide code
all_fit_metrics[["ranger"]] |>                              # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~mtry) +
  scale_x_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  ggtitle(label = "ranger") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Najlepszy mtry = 4 300/400 drzew jest wystarczjaące Najniższe RMSE dla sporej liczby min_n - koło 25, drzewa płytsze i zgeneralizowane

Show and hide code
all_fit_metrics[["ranger"]] |> 
  filter(.metric == "rmse", .estimate < 2.74) |> 
  arrange(.estimate) |> 
  gt::gt()
mtry trees min_n .metric .estimator .estimate .config

Najlepsze parametry z config 07

12.2 Cubist

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "bist_cubist") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric committees neighbors .config
1 91 0 Preprocessor1_Model46
2 91 0 Preprocessor1_Model46
3 91 0 Preprocessor1_Model46
Show and hide code
all_fit_metrics[["bist_cubist"]] |>                               # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point(position = position_jitter(width = 0.5, height = 0)) +
  facet_wrap(~neighbors, scales = "free_y") +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Najlepsze neighbors 0 i dla najwiekszych im zwiekszam tym sie rmse zmniejsza.

Show and hide code
all_fit_metrics[["bist_cubist"]] |>                               # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  filter(between(neighbors, 7, 9)) |> 
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point() +
  geom_line() +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist")

Show and hide code
all_fit_metrics[["bist_cubist"]] |> 
  filter(.metric == "rmse", .estimate < 2.86) |> 
  arrange(.estimate) |> 
  gt::gt()
committees neighbors .metric .estimator .estimate .config
91 0 rmse standard 2.704234 46
77 0 rmse standard 2.704234 39
53 0 rmse standard 2.704235 27
37 0 rmse standard 2.704237 19
11 0 rmse standard 2.704248 06
9 9 rmse standard 2.827731 05
23 9 rmse standard 2.827732 12
47 9 rmse standard 2.827733 24
63 9 rmse standard 2.827733 32
89 9 rmse standard 2.827733 45
21 8 rmse standard 2.848578 11
35 8 rmse standard 2.848578 18
61 8 rmse standard 2.848579 31
75 8 rmse standard 2.848579 38
87 8 rmse standard 2.848579 44
7 7 rmse standard 2.856703 04
33 7 rmse standard 2.856706 17
49 7 rmse standard 2.856706 25
73 7 rmse standard 2.856706 37
100 7 rmse standard 2.856707 50

Dla małych neighbors dodatkowe komisje nie mają sensu, model jest już optymalny. Dla większych neighbors dodanie komisji nieznacznie zmienia RMSE, efekt jest minimalny.

12.3 Rpart

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "rpart") |> 
      select_best(metric = x) |>
      mutate(.metric = x)) |> 
  gt::gt()
cost_complexity min_n .config .metric
0.005179475 21 Preprocessor1_Model43 mae
0.005179475 21 Preprocessor1_Model43 rmse
0.005179475 21 Preprocessor1_Model43 rsq
Show and hide code
all_fit_metrics[["rpart"]] |>                                # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(cost_complexity, .estimate, color = factor(min_n))) +
  geom_point() +
  geom_line() +
#  facet_wrap(~min_n) +
  scale_x_log10(breaks = breaks_log(n = 10, base = 10), labels = label_log(base = 10)) +
  labs(title =  "rpart", color = "min_n", y = "rmse")

Show and hide code
all_fit_metrics[["rpart"]] |>
  filter(.metric == "rmse") |>
  slice_min(.estimate, n = 3)
# A tibble: 3 × 6
  cost_complexity min_n .metric .estimator .estimate .config
            <dbl> <int> <chr>   <chr>          <dbl> <chr>  
1         0.00518    21 rmse    standard        2.70 43     
2         0.00339    17 rmse    standard        2.72 42     
3         0.00791    25 rmse    standard        2.73 44     

12.4 xgboost

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
    combined_results |> 
      extract_workflow_set_result(id = "xgboost") |> 
      select_best(metric = x) |>
      mutate(.metric = x)) |> 
  gt::gt()
trees min_n tree_depth learn_rate loss_reduction sample_size .config .metric
108 9 5 1.095448 222.2996 0.8061224 Preprocessor1_Model09 mae
108 9 5 1.095448 222.2996 0.8061224 Preprocessor1_Model09 rmse
108 9 5 1.095448 222.2996 0.8061224 Preprocessor1_Model09 rsq
Show and hide code
all_fit_metrics[["xgboost"]] |>                                # Zmień model ...
  filter(.metric == "rmse") |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth, scales = "free_y") +
  labs(title =  "xgboost", color = "min_n", y = "rmse")

rmse <2.8

Show and hide code
all_fit_metrics[["xgboost"]] |>                                # Zmień model ...
  filter(.metric == "rmse", .estimate < 2.8) |>                               # Zmień statystykę rsq, mae 
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth) +
  labs(title =  "xgboost", color = "min_n", y = "rmse")

min_n 9 oraz depth 5

12.5 Transformed mars

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
      combined_results |> 
      extract_workflow_set_result(id = "transformed_mars") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric num_terms prod_degree .config
1 3 1 Preprocessor1_Model2
2 3 2 Preprocessor1_Model6
3 3 2 Preprocessor1_Model6
Show and hide code
all_fit_metrics[["transformed_mars"]] |>                              
  filter(.metric == "rmse") |>                              
  ggplot(aes(num_terms, .estimate, color = factor(prod_degree))) + 
  geom_point() +
  geom_line() +
  facet_wrap(~prod_degree, scales = "free_y") +
  ggtitle(label = "transformed_mars") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

minimalne różnice

12.6 knn

Show and hide code
c("mae", "rmse", "rsq") |> 
  map_dfr(
    \(x) 
      combined_results |> 
      extract_workflow_set_result(id = "transformed_knn") |> 
      select_best(metric = x), .id = ".metric") |> 
  gt::gt()
.metric neighbors .config
1 15 Preprocessor1_Model14
2 15 Preprocessor1_Model14
3 15 Preprocessor1_Model14
Show and hide code
colnames(all_fit_metrics[["transformed_knn"]])
[1] "neighbors"  ".metric"    ".estimator" ".estimate"  ".config"   
Show and hide code
all_fit_metrics[["transformed_knn"]] |>                              
  filter(.metric == "rmse") |>                              
  ggplot(aes(neighbors, .estimate, color = factor(neighbors))) +  
  geom_point() +
  geom_line(aes(group = 1)) + 
  ggtitle(label = "transformed_knn") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

RMSE maleje wraz ze wzrostem sasiadow

Show and hide code
all_fit_metrics[["transformed_knn"]] |> 
  filter(.metric == "rmse", .estimate < 2.87) |> 
  arrange(.estimate) |> 
  gt::gt()
neighbors .metric .estimator .estimate .config
15 rmse standard 2.805569 14
14 rmse standard 2.818775 13
12 rmse standard 2.849140 12
11 rmse standard 2.867958 11

Każda z opcji jest dobra - małe różnice

13 Result best models

Show and hide code
p <-
  best_models |>
  map_dfr(.f = \(x) collect_metrics(x = x), .id = "mod") |>
  mutate(mod = factor(mod, levels = c("rpart", "ranger", "bist_cubist", "xgboost", "transformed_mars", "transformed_knn"))) |>
  ggplot(aes(mod, .estimate, colour = mod)) +
  geom_point() +
  facet_wrap(~.metric, scales = "free_y") +
  theme_bw() +
  labs(
    x = "Model",
    y = "Wartosc metryki",
    colour = "Model"
  ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p

Show and hide code
ggsave(
  filename = "rys_4_test_result_model.jpeg", device = "jpeg",
  width = 8.5, height = 3, dpi = 300
)

Testowy vs walidacyjny

Show and hide code
por_test_valid <-
  bind_rows(
  best_models |>
      map_dfr(\(x) collect_metrics(x = x), .id = "mod") |>
      select(mod, .metric, .estimate) |>
      rename(mean = .estimate) |>
      mutate(typ = "testowy"),
    combined_results |>
      rank_results(select_best = T) |>
      select(wflow_id, .metric, mean) |>
      rename(mod = wflow_id) |>
      mutate(typ = "walidacyjny")
  ) |>
  pivot_wider(
    names_from = typ,
    values_from = mean
  )

por_test_valid |>
  gt::gt()  |> 
  gt::fmt_number(n_sigfig = 3)
mod .metric testowy walidacyjny
bist_cubist rmse 2.61 2.59
bist_cubist rsq 0.231 0.220
bist_cubist mae 2.09 2.07
ranger rmse 2.66 2.65
ranger rsq 0.206 0.188
ranger mae 2.22 2.21
rpart rmse 2.61 2.61
rpart rsq 0.231 0.212
rpart mae 2.10 2.10
transformed_knn rmse 2.77 2.77
transformed_knn rsq 0.157 0.138
transformed_knn mae 2.33 2.32
transformed_mars rmse 2.61 2.59
transformed_mars rsq 0.231 0.218
transformed_mars mae 2.10 2.09
xgboost rmse 2.63 2.63
xgboost rsq 0.217 0.200
xgboost mae 2.20 2.18
Show and hide code
por_test_valid |>
  filter(.metric == "rmse") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  xlim(2.5, 3) + ylim(2.5, 3)

Show and hide code
por_test_valid |>
  filter(.metric == "rsq") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  expand_limits(x = c(0,0.25), y = c(0,0.25)) +
  scale_x_continuous(expand = c(0,0), breaks = seq(0, 0.25, 0.05)) +
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 0.25, 0.05)) 

Show and hide code
por_test_valid |>
  filter(.metric == "mae") |> 
  ggplot(aes(walidacyjny, testowy, colour = mod)) +
  geom_point(size = 4) +
  theme_bw() +
  geom_abline(slope = 1) +
  labs(
    x = "Walidacyjny",
    y = "Testowy",
    colour = "Model"
  ) +
  coord_fixed(ratio = 1) +
  expand_limits(x = c(0,3), y = c(0,3)) +
  scale_x_continuous(expand = c(0,0), breaks = seq(0, 3, 0.5)) +
  scale_y_continuous(expand = c(0,0), breaks = seq(0, 3, 0.5)) 

Wykres rozrzutu

Show and hide code
my_breaks <- c(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)

range_axis_x_y <-
  best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  summarise(max = max(max(total_price), max(.pred))) |>
  pull()

best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  ggplot(aes(total_price, .pred)) +
  geom_hex() +
  facet_wrap(~mod) +
  geom_abline(slope = rep(c(1.5, 0.5, 1), 4)) +
  theme_bw() +
  viridis::scale_fill_viridis(
    breaks = my_breaks,
    labels = my_breaks,
    option = "D",
    direction = 1,
    trans = scales::pseudo_log_trans(sigma = 1)
  ) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  coord_obs_pred() +
  labs(
    x = expression("Cena całkowita"),
    y = expression("Cena całkowita"),
    fill = "Czestosc", title = "Ocena na zestawie testowym"
  )

Show and hide code
ggsave(
  filename = "fig_2_Best_models_for_test_data.jpeg",
  device = "jpeg",
  width = 8.00,
  height = 4.60,
  dpi = 300
)
Show and hide code
best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  openair::modStats(mod = ".pred", obs = "total_price", type = "mod") |>
  arrange(FAC2) |>
  select(-P) |>
  gt::gt() |>
  gt::fmt_number(columns = FAC2:IOA, n_sigfig = 3)
mod n FAC2 MB MGE NMB NMGE RMSE r COE IOA
rpart 1001 0.818 −0.00253 2.10 −0.000410 0.341 2.61 0.481 0.152 0.576
transformed_knn 1001 0.823 0.00317 2.33 0.000515 0.378 2.77 0.396 0.0591 0.530
ranger 1001 0.837 0.00369 2.22 0.000598 0.360 2.66 0.454 0.104 0.552
bist_cubist 1001 0.870 0.0279 2.09 0.00453 0.339 2.61 0.480 0.156 0.578
xgboost 1001 0.882 0.0296 2.20 0.00479 0.358 2.63 0.466 0.110 0.555
transformed_mars 1001 0.976 −0.00187 2.10 −0.000304 0.340 2.61 0.481 0.153 0.576
  • Najniższe RMSE: cubist
  • Najwyższe r: rpart, mars
  • Najwyższe IOA: cubist ( Index of Agreement based on Willmott et al. (2011), which spans between -1 and +1 with values approaching +1 representing better model performance)
  • Najwyższe FAC2: mars, wyjątkowo dobre oszacowanie, bliskie 1.


Cubist wypadł najlepiej, modele drzew również dobrze sobie radzą, modele transformacyjne działają słabiej.
Top3: cubist, rpart, mars